Load Packages
library(Seurat)
## Attaching SeuratObject
## Attaching sp
library(magrittr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
Load Data. This data contains only Endothelial cells. These have been selected from a larger object.
Samples include Lean, Obese and Unhealthy Obese this is “condition”
endo_cells <- readRDS(file ="/home/lucamannino/R_Projects/Intagration_endoCellAtlas/endo_cells.RDS")
We build a new UMAP plot to look at the data We use the integrated assay for this step
DefaultAssay(endo_cells) <- "integrated"
endo_cells <- FindNeighbors(endo_cells, dims = 1:25)
## Computing nearest neighbor graph
## Computing SNN
endo_cells <- RunUMAP(endo_cells, dims = 1:25)
## Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
## To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
## This message will be shown once per session
## 18:08:09 UMAP embedding parameters a = 0.9922 b = 1.112
## 18:08:09 Read 27325 rows and found 25 numeric columns
## 18:08:09 Using Annoy for neighbor search, n_neighbors = 30
## 18:08:09 Building Annoy index with metric = cosine, n_trees = 50
## 0% 10 20 30 40 50 60 70 80 90 100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 18:08:14 Writing NN index file to temp file /tmp/RtmpZoJt7b/file21839d5a3e9d2b
## 18:08:14 Searching Annoy index using 1 thread, search_k = 3000
## 18:08:24 Annoy recall = 100%
## 18:08:25 Commencing smooth kNN distance calibration using 1 thread
## 18:08:26 Initializing from normalized Laplacian + noise
## 18:08:27 Commencing optimization for 200 epochs, with 1313600 positive edges
## 18:08:40 Optimization finished
endo_cells <- FindClusters(endo_cells, resolution = 0.8)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 27325
## Number of edges: 1089437
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8631
## Number of communities: 15
## Elapsed time: 8 seconds
There are several patients included in this study, the patients labels are recorded under endo_cells@meta.data[[“orig.ident”]].
Patients have been categorized by their condition: Obese, Unhelthy Obese and Lean (endo_cells@meta.data[[“condition”]]).
From each patient subcutaneous and visceral cells were analysed, which cells belong to which tissue is recorded under endo_cells@meta.data[[“type”]]
DimPlot(endo_cells, label=TRUE, group.by = "type")
DimPlot(endo_cells, label=TRUE, group.by = "condition")
DimPlot(endo_cells, label=TRUE, group.by = "orig.ident")
In the following plot we look at specific markers to further clean the object from non endothelial cells For this we use the SCT assay
DefaultAssay(endo_cells) <- "SCT"
FeaturePlot(endo_cells, c("PECAM1","CDH5","CLU","VWF"))
DotPlot(endo_cells, features = c("PECAM1","CDH5","CLU","VWF"), assay="SCT")
DotPlot(endo_cells, features = c("FLRT2","ACSL4","COL8A1","CACNB2"), assay="SCT")
There are several patients included in this study, the patients labels are recorded under endo_cells@meta.data[[“orig.ident”]].
Patients have been categorized by their condition: Obese, Unhelthy Obese and Lean (endo_cells@meta.data[[“condition”]]).
From each patient subcutaneous and visceral cells were analysed, which cells belong to which tissue is recorded under endo_cells@meta.data[[“type”]]
DimPlot(endo_cells, label=TRUE, group.by = "type")
DimPlot(endo_cells, label=TRUE, group.by = "condition")
DimPlot(endo_cells, label=TRUE, group.by = "orig.ident")
we remove clusters 0,1,2,13 as they are not endothelial
EndoCells1 <- subset(endo_cells, idents = c(3,4,5,6,7,8,9,10,11,12,14))
New Umap plot is produced
DefaultAssay(EndoCells1) <- "integrated"
EndoCells1 <- FindNeighbors(EndoCells1, dims = 1:20)
## Computing nearest neighbor graph
## Computing SNN
EndoCells1 <- RunUMAP(EndoCells1, dims = 1:20)
## 18:09:15 UMAP embedding parameters a = 0.9922 b = 1.112
## 18:09:15 Read 13643 rows and found 20 numeric columns
## 18:09:15 Using Annoy for neighbor search, n_neighbors = 30
## 18:09:15 Building Annoy index with metric = cosine, n_trees = 50
## 0% 10 20 30 40 50 60 70 80 90 100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 18:09:18 Writing NN index file to temp file /tmp/RtmpZoJt7b/file21839d6e90b215
## 18:09:18 Searching Annoy index using 1 thread, search_k = 3000
## 18:09:23 Annoy recall = 100%
## 18:09:24 Commencing smooth kNN distance calibration using 1 thread
## 18:09:25 Initializing from normalized Laplacian + noise
## 18:09:25 Commencing optimization for 200 epochs, with 643354 positive edges
## 18:09:31 Optimization finished
EndoCells1 <- FindClusters(EndoCells1, resolution = 3.5)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 13643
## Number of edges: 558064
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.7377
## Number of communities: 43
## Elapsed time: 1 seconds
Check for immune cells
DefaultAssay(EndoCells1) <- "integrated"
EndoCells1 <- FindClusters(EndoCells1, resolution = 1.0)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 13643
## Number of edges: 558064
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8496
## Number of communities: 15
## Elapsed time: 1 seconds
DefaultAssay(EndoCells1) <- "SCT"
FeaturePlot(EndoCells1, c("CD163","MRC1","PTPRC"), pt.size = 0.1,reduction ="umap",min.cutoff=0)
FeaturePlot(EndoCells1, c("CDH5","PECAM1","LYVE1","PROX1"), pt.size = 0.1,reduction ="umap",min.cutoff=0)
DimPlot(EndoCells1, label=TRUE)
DotPlot(EndoCells1,features = c("CD163","MRC1","PTPRC","CDH5","PECAM1","LYVE1","PROX1"), assay = "SCT",cols = c("blue", "red"))
DimPlot(EndoCells1, label=TRUE, group.by = "type")
DimPlot(EndoCells1, label=TRUE, group.by = "condition")
DimPlot(EndoCells1, label=TRUE, group.by = "orig.ident")
removing clusters 7 and 8 and 14
EndoCells2 <- subset(EndoCells1, idents = c(0,1,2,3,4,5,6,9,10,11,12,13))
In this step we decided to carry out a new SCT transform and a new integration in order to obtain better results. As we have removed a lot of cells from the original SCT and integration assay it is usefull to run the integration and the normalisation again.
To acheve this we split the object in its original samples, run SCT transform on the “SOUP” assay. We will then integrate the object by sample.
EndoCells2.list <- SplitObject(EndoCells2, split.by = "orig.ident")
for (i in 1:length(EndoCells2.list)) {
EndoCells2.list[[i]] <- SCTransform(EndoCells2.list[[i]], assay = "soup", variable.features.n = 3000, verbose = FALSE)
}
need to use: options(future.globals.maxSize = 2500 * 1024^2) in order to increase the maximum allowed size for the integration algorithm
options(future.globals.maxSize = 2500 * 1024^2)
EndoCells2.list.features <- SelectIntegrationFeatures(object.list = EndoCells2.list, nfeatures = 3000)
EndoCells2.list <- PrepSCTIntegration(object.list = EndoCells2.list, anchor.features = EndoCells2.list.features,
verbose = FALSE)
options(future.globals.maxSize = 2500 * 1024^2)
EndoCells2.list.anchors <- FindIntegrationAnchors(object.list = EndoCells2.list, normalization.method = "SCT",
anchor.features = EndoCells2.list.features, verbose = FALSE)
EndoCells2.list.integrated <- IntegrateData(anchorset = EndoCells2.list.anchors, normalization.method = "SCT",
verbose = FALSE)
We produce a new UMAP plot from the integrated data
DefaultAssay(EndoCells2.list.integrated) <- "integrated"
EndoCells2.list.integrated <- RunPCA(EndoCells2.list.integrated, npcs = 40, verbose = FALSE)
EndoCells2.list.integrated <- FindNeighbors(EndoCells2.list.integrated, dims = 1:20)
## Computing nearest neighbor graph
## Computing SNN
EndoCells2.list.integrated <- RunUMAP(EndoCells2.list.integrated, dims = 1:20)
## 18:24:24 UMAP embedding parameters a = 0.9922 b = 1.112
## 18:24:24 Read 11622 rows and found 20 numeric columns
## 18:24:24 Using Annoy for neighbor search, n_neighbors = 30
## 18:24:24 Building Annoy index with metric = cosine, n_trees = 50
## 0% 10 20 30 40 50 60 70 80 90 100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 18:24:25 Writing NN index file to temp file /tmp/RtmpZoJt7b/file21839d2887844f
## 18:24:25 Searching Annoy index using 1 thread, search_k = 3000
## 18:24:29 Annoy recall = 100%
## 18:24:29 Commencing smooth kNN distance calibration using 1 thread
## 18:24:30 Initializing from normalized Laplacian + noise
## 18:24:30 Commencing optimization for 200 epochs, with 514826 positive edges
## 18:24:35 Optimization finished
EndoCells2.list.integrated <- FindClusters(EndoCells2.list.integrated, resolution = 0.30)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 11622
## Number of edges: 473471
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9103
## Number of communities: 8
## Elapsed time: 1 seconds
QC the new object
DefaultAssay(EndoCells2.list.integrated) <- "SCT"
DimPlot(EndoCells2.list.integrated, label=TRUE, group.by = "type")
DimPlot(EndoCells2.list.integrated, label=TRUE, group.by = "condition")
DimPlot(EndoCells2.list.integrated, label=TRUE, group.by = "orig.ident")
QC checks to see if we have any non endothelial cells left
DefaultAssay(EndoCells2.list.integrated) <- "SCT"
FeaturePlot(EndoCells2.list.integrated, c("CD163","MRC1","PTPRC"), pt.size = 0.1,reduction ="umap",min.cutoff=0)
FeaturePlot(EndoCells2.list.integrated, c("CDH5","PECAM1","LYVE1","PROX1"), pt.size = 0.1,reduction ="umap",min.cutoff=0)
FeaturePlot(EndoCells2.list.integrated, c("CDH5"), pt.size = 0.1,reduction ="umap",min.cutoff=0)
DimPlot(EndoCells2.list.integrated, label=TRUE)
DotPlot(EndoCells2.list.integrated,features = c("CD163","MRC1","PTPRC","CDH5","PECAM1", "VCAM1", "LYVE1","PROX1"), assay = "SCT",cols = c("blue", "red"))
We remove cluster 3 from the object
Strict2 <- subset(EndoCells2.list.integrated, idents = c(0,1,2,4,5,6,7))
DefaultAssay(Strict2) <- "integrated"
Strict2 <- FindNeighbors(Strict2, dims = 1:20)
## Computing nearest neighbor graph
## Computing SNN
Strict2 <- RunUMAP(Strict2, dims = 1:20)
## 18:24:51 UMAP embedding parameters a = 0.9922 b = 1.112
## 18:24:51 Read 10236 rows and found 20 numeric columns
## 18:24:51 Using Annoy for neighbor search, n_neighbors = 30
## 18:24:51 Building Annoy index with metric = cosine, n_trees = 50
## 0% 10 20 30 40 50 60 70 80 90 100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 18:24:52 Writing NN index file to temp file /tmp/RtmpZoJt7b/file21839d15f20876
## 18:24:52 Searching Annoy index using 1 thread, search_k = 3000
## 18:24:55 Annoy recall = 100%
## 18:24:56 Commencing smooth kNN distance calibration using 1 thread
## 18:24:56 Initializing from normalized Laplacian + noise
## 18:24:57 Commencing optimization for 200 epochs, with 452802 positive edges
## 18:25:01 Optimization finished
Strict2 <- FindClusters(Strict2, resolution = 0.5)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 10236
## Number of edges: 415950
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8695
## Number of communities: 12
## Elapsed time: 1 seconds
DefaultAssay(Strict2) <- "SCT"
DimPlot(Strict2, label=TRUE, group.by = "type")
DimPlot(Strict2, label=TRUE, group.by = "condition")
DimPlot(Strict2, label=TRUE, group.by = "orig.ident")
DimPlot(Strict2, label=TRUE)
DimPlot(Strict2, label=TRUE, split.by = "type")
FeaturePlot(Strict2, c("PROX1"), pt.size = 0.1,reduction ="umap",min.cutoff=0, split.by ="type" )
FeaturePlot(Strict2, c("CD163","MRC1","PTPRC"), pt.size = 0.1,reduction ="umap",min.cutoff=0)
FeaturePlot(Strict2, c("CDH5","PECAM1","LYVE1","PROX1"), pt.size = 0.1,reduction ="umap",min.cutoff=0)
FeaturePlot(Strict2, c("CDH5"), pt.size = 0.1,reduction ="umap",min.cutoff=0)
DimPlot(Strict2, label=TRUE)
DotPlot(Strict2,features = c("CD163","MRC1","PTPRC","CDH5","PECAM1","LYVE1","PROX1"), assay = "SCT",cols = c("blue", "red"))
To label the endothelial subclusters we use Find all marker function, and then use this marker genes to appropiately label subclusters
combined.sct <- PrepSCTFindMarkers(Strict2)
## Found 8 SCT models. Recorrecting SCT counts using minimum median counts: 1216.34073055455
combined.markers <- FindAllMarkers(combined.sct, assay = "SCT", verbose = FALSE, only.pos = TRUE, logfc.threshold = 0.35,)
## For a more efficient implementation of the Wilcoxon Rank Sum Test,
## (default method for FindMarkers) please install the limma package
## --------------------------------------------
## install.packages('BiocManager')
## BiocManager::install('limma')
## --------------------------------------------
## After installation of limma, Seurat will automatically use the more
## efficient implementation (no further action necessary).
## This message will be shown once per session
head(combined.markers, n = 15)
## p_val avg_log2FC pct.1 pct.2 p_val_adj cluster gene
## PKHD1L1 0 1.7696767 0.730 0.093 0 0 PKHD1L1
## EFNA5 0 1.5901618 0.689 0.179 0 0 EFNA5
## RELN 0 1.5071423 0.618 0.077 0 0 RELN
## PPFIBP1 0 1.4111746 0.817 0.403 0 0 PPFIBP1
## MMRN1 0 1.3923432 0.620 0.087 0 0 MMRN1
## PDE1A 0 1.3657783 0.576 0.093 0 0 PDE1A
## LINC02147 0 1.3305961 0.680 0.152 0 0 LINC02147
## LINC02208 0 1.2040092 0.572 0.114 0 0 LINC02208
## DOCK5 0 1.1863608 0.629 0.093 0 0 DOCK5
## NRG3 0 1.1822006 0.527 0.074 0 0 NRG3
## AL357507.1 0 1.1125211 0.385 0.058 0 0 AL357507.1
## PIEZO2 0 1.1002791 0.559 0.064 0 0 PIEZO2
## PRKG1 0 1.0905427 0.649 0.190 0 0 PRKG1
## MPP7 0 0.9822819 0.472 0.052 0 0 MPP7
## VAV3 0 0.9550643 0.589 0.097 0 0 VAV3
#We produce a table with top 50 marker genes per cluster to facilitate use
combined.markers %>%
group_by(cluster) %>%
slice_max(n = 50, order_by = avg_log2FC)
## # A tibble: 568 × 7
## # Groups: cluster [12]
## p_val avg_log2FC pct.1 pct.2 p_val_adj cluster gene
## <dbl> <dbl> <dbl> <dbl> <dbl> <fct> <chr>
## 1 0 1.77 0.73 0.093 0 0 PKHD1L1
## 2 0 1.59 0.689 0.179 0 0 EFNA5
## 3 0 1.51 0.618 0.077 0 0 RELN
## 4 0 1.41 0.817 0.403 0 0 PPFIBP1
## 5 0 1.39 0.62 0.087 0 0 MMRN1
## 6 0 1.37 0.576 0.093 0 0 PDE1A
## 7 0 1.33 0.68 0.152 0 0 LINC02147
## 8 0 1.20 0.572 0.114 0 0 LINC02208
## 9 0 1.19 0.629 0.093 0 0 DOCK5
## 10 0 1.18 0.527 0.074 0 0 NRG3
## # … with 558 more rows
#Following code is to produce a heatmap of the 20 top marker genes per cluster
combined.markers %>%
group_by(cluster) %>%
top_n(n = 20, wt = avg_log2FC) -> top20
DoHeatmap(combined.sct, features = top20$gene) + NoLegend()
## Warning in DoHeatmap(combined.sct, features = top20$gene): The following
## features were omitted as they were not found in the scale.data slot for the SCT
## assay: ZEB1, EXOC6B, LPP, PTEN, ABLIM1, PKHD1L1